kernel_path =  (File.expand_path(__FILE__).split("/"))[0..-2].join("/") + '/'
require kernel_path+'pdb_my_lib.rb'
require kernel_path+'log_lib'
require kernel_path+'env'
require kernel_path+'db_lib'

$bl2seq = get_path('bl2seq') if ! $bl2seq 
$seq_files = get_path('seq_files') if ! $seq_files
#-------------------------------------------------------------------
def get_string_stream str
	# Wraps +str+ in a readable IO-like object. Prefers an in-memory
	# StringIO; if that library is somehow unavailable, falls back to
	# round-tripping the text through a scratch file on disk.
	begin
		require 'stringio'
		StringIO.new(str)
	rescue
		scratch = File.new("tmp_instead_of_string_io","w")
		scratch.puts str
		scratch.flush
		scratch.close
		File.new("tmp_instead_of_string_io","r")
	end
end
#-------------------------------------------------------------------
def skip_io_until(io,str,ctrl_str='stringthatcannverexistinmostofourtexts')
	# Consumes lines from +io+ until one contains +str+ (or +ctrl_str+).
	# Returns the matching line, or nil when the stream runs out first.
	line = nil
	loop do
		line = io.gets
		break if line.nil? || line.include?(str) || line.include?(ctrl_str)
	end
	line
end
#-------------------------------------------------------------------
def do_with_lines_pairs(filename=ARGV[0],limit=123456789, &block)
	# Yields consecutive line pairs (row1, row2) from +filename+.
	# On a file with an odd number of lines, the final row2 is nil.
	# At most +limit+ pairs are yielded; the previous version tested the
	# counter with '>' after incrementing and so processed limit+1 pairs,
	# and it also never closed the file handle.
	File.open(filename,'r') do |file|
		counter = 0
		while line = file.gets do
			yield line, file.gets
			counter = counter + 1
			break if counter >= limit
		end
	end
end
#-----------------------------------------------------------------------------------
def do_with_lines(filename=ARGV[0],limit=123456789, &block)
	# Yields each line of +filename+ in order, stopping after +limit+ lines.
	# Uses the block form of File.open so the descriptor is always closed
	# (the previous version leaked it and yielded limit+1 lines because it
	# compared with '>' after incrementing the counter).
	File.open(filename,'r') do |file|
		counter = 0
		while line = file.gets do
			yield line
			counter = counter + 1
			break if counter >= limit
		end
	end
end

#-------------------------------------------------------------------
#Will parse blastp output file (in fmt 0) looking for:
#         #Query=  1vpd_A mol:protein length:299  TARTRONATE SEMIALDEHYDE REDUCTASE
#each query block will be given to &block
def iterate_blast_queries(blast_file, limit, &block)
	# Splits a blastp fmt-0 output file into per-query text blocks.
	# A block starts at each line containing 'Query=' and is yielded when
	# the NEXT 'Query=' (or EOF) is reached. Header text before the first
	# 'Query=' is discarded. Returns the number of queries seen.
	# Fixes vs. previous version: the file handle is closed, an empty file
	# no longer yields "", and stopping early at +limit+ no longer yields
	# a truncated partial query.
	limit = 1234565789 if (!limit or limit<1) # nil / non-positive => no limit
	counter = 0
	query = []
	truncated = false # set when we stop early so the partial tail is not yielded
	File.open(blast_file,'r') do |file|
		while( line = file.gets )
			if line.include? 'Query=' #starting new chain
				yield query.join("") unless ( query.empty? or counter < 1)
				query = []
				counter = counter+1
			end
			query << line.to_s
			if counter > limit
				truncated = true
				break
			end
		end
	end
	# Flush the last complete query (the loop only yields on the next 'Query=').
	yield query.join("") unless query.empty? or truncated
	return counter
end
#-------------------------------------------------------------------
#As every query may have many hits, we parse each one of them and call &block
#	#> 1vpd_A mol:protein length:299  TARTRONATE SEMIALDEHYDE REDUCTASE
def iterate_hits_of_single_query q, &block
	# Walks the text of one query block line by line; every line containing
	# '>' starts a new hit. Yields each hit's full text together with the
	# hit chain name parsed from its header line (e.g. '>1vpd_A ...').
	stream = get_string_stream q
	seen = 0
	hit_lines = []
	h_name='nil' # sentinel: no hit header encountered yet
	while (line = stream.gets)
		if line.include? '>' # new hit header
			# Emit the previously collected hit (nothing before the first header).
			yield hit_lines.join(""), h_name unless seen < 1
			h_name = line.split()[0].split('>')[-1]
			seen = seen + 1
			hit_lines = []
		end
		hit_lines << line.to_s
	end #while
	# Emit the trailing hit, provided at least one header was seen.
	yield hit_lines.join(""), h_name if h_name != 'nil'
end
#---------------------------------------------------
#Hit may contain several parts. Every part starts with
	#Score =  598 bits (1543),  Expect = 2e-171, Method: Compositional matrix adjust.
	#Identities = 299/299 (100%), Positives = 299/299 (100%), Gaps = 0/299 (0%)
#and has an alignment block (see parser for alignment blocks)

def parse_hit(r_name, h)
	# Parses the text of one blast hit +h+ (subject chain +r_name+) into an
	# array of hashes, one per HSP: each 'Score =' section with its
	# 'Identities' line and Query/Sbjct alignment rows.
	# Hash keys: :r_name, :evalue, :ident, :positives, :gaps (percent
	# strings; :gaps defaults to 0), :left_q/:right_q/:left_s/:right_s
	# (boundary residue numbers as strings), :seq_q/:seq_s (gapped
	# sequences) and :length (gap-free alignment columns).
	file = get_string_stream h
	array_of_hits = []
	while true
		hit_hash = {}
		hit_hash[:r_name] = r_name
		
		# Each HSP starts with a 'Score' line; EOF means the hit is done.
		line = skip_io_until(file,'Score') 
		return array_of_hits if line == nil
		#Score =  598 bits (1543),  Expect = 2e-171, Method: Compositional matrix adjust.
		# 2nd comma field is ' Expect = 2e-171'; its 3rd token is the evalue.
		hit_hash[:evalue] = line.split(",")[1].split(" ")[2]
		
		line = skip_io_until(file,'Identities')
		# Identities = 20/74 (27%), Positives = 33/74 (44%), Gaps = 1/74 (1%)
		hit_hash[:gaps] = 0
		arr = line.split(",")
		# Take the percentage inside the parentheses of each present field.
		hit_hash[:ident] = arr[0].split("(")[1].split("%")[0] if line.include? 'Identities'
		hit_hash[:positives] = arr[1].split("(")[1].split("%")[0] if line.include? 'Positives'
		hit_hash[:gaps] = arr[2].split("(")[1].split("%")[0] if line.include? 'Gaps'
			
		# Consume this HSP's Query/Sbjct rows; stops before the next 'Score'.
		align_data = parse_alignment_block(file) #hit_hash[:length].to_i)
		hit_hash[:left_q]=align_data[0]
		hit_hash[:right_q]=align_data[1]
		hit_hash[:left_s]=align_data[2]
		hit_hash[:right_s]=align_data[3]
		hit_hash[:seq_q] = align_data[4]
		hit_hash[:seq_s]=align_data[5]
		# Alignment length counts columns where neither side is a gap.
		hit_hash[:length] = length_of_alignment(hit_hash[:seq_q].split(''),hit_hash[:seq_s].split(''))
		array_of_hits << hit_hash
	end
end

#---------------------------------------------------
#---------------------------------------------------
#	Query  1    MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAKSELDKAIGRNTNGV  60
#		    MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLN   AAKSELDKAIGRNTNGV
#	Sbjct  1    MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLN---AAKSELDKAIGRNTNGV  57
def parse_alignment_block(file)
	# Reads the interleaved Query/Sbjct rows of one HSP from +file+ and
	# returns [left_q, right_q, left_s, right_s, seq_q, seq_s] as strings:
	# first/last residue numbers plus the concatenated gapped sequences.
	seq_q = []
	seq_s = []
	line = skip_io_until(file,'Query') 
	while line do
		parts = line.split
		# left_q/left_s are unassigned (hence nil) on the first pass, so
		# only the first row sets them; right_q/right_s keep updating and
		# the last row wins.
		left_q = parts[1].to_i if !left_q
		right_q = parts[3].to_i 
		seq_q << parts[2] 
		
		line = skip_io_until(file,'Sbjct') 
		parts = line.split
		left_s = parts[1].to_i if !left_s
		right_s = parts[3].to_i 
		seq_s << parts[2]
		
		# Another 'Query' row continues this HSP; a 'Score' line belongs to
		# the next HSP and terminates this block.
		line = skip_io_until(file,'Query','Score') 
		line = nil if !line or line.include? 'Score'
	end
	return [left_q.to_s, right_q.to_s, left_s.to_s, right_s.to_s, seq_q.join(""),seq_s.join("")]
end

def length_of_alignment(q,s)
	# Counts alignment columns where neither sequence carries a gap ('-').
	# +q+ and +s+ are arrays of single characters; positions past the end
	# of +s+ yield nil, which (like the original index loop) still counts.
	q.zip(s).count { |a, b| a != '-' and b != '-' }
end

#-------------------------------------------------------------------
def print_pair(query_name,hit_hash)
	# Logs one CSV row describing the alignment between +query_name+ and
	# the hit's chain. The pair is ordered lexicographically so the same
	# two chains always log identically; when the sort reverses the order,
	# the query/subject columns are swapped to match.
	pair = [query_name,hit_hash[:r_name]].sort #sort lex. the names of chains
	return if pair[0]==pair[1] #we don't want self-similarity
	swapped = (query_name == pair[1])
	if swapped
		left_q,  right_q = hit_hash[:left_s], hit_hash[:right_s]
		left_s,  right_s = hit_hash[:left_q], hit_hash[:right_q]
		seq_q,   seq_s   = hit_hash[:seq_s],  hit_hash[:seq_q]
	else
		left_q,  right_q = hit_hash[:left_q], hit_hash[:right_q]
		left_s,  right_s = hit_hash[:left_s], hit_hash[:right_s]
		seq_q,   seq_s   = hit_hash[:seq_q],  hit_hash[:seq_s]
	end
	log_me "#{left_q},#{right_q},#{left_s},#{right_s},#{seq_q},#{seq_s},#{hit_hash[:length]},#{hit_hash[:evalue]},#{hit_hash[:ident].to_s},#{hit_hash[:positives].to_s},#{hit_hash[:gaps].to_s}"
end
#-------------------------------------------------------------------

def parse_multi_blast(fname)
	# For every query block in a multi-query blast file, parse each of its
	# hits and log the first HSP as a CSV row via print_pair.
	iterate_blast_queries(fname, -1) do |q|
		q_name = q.split(" ")[1]
		iterate_hits_of_single_query(q) do |h, p_name|
			print_pair(q_name, parse_hit(p_name, h)[0])
		end
	end
end
#---------------------------------------------------
def parse_hit_header(h)
	# Extracts the alignment length from a hit's 'Identities' line, e.g.
	# ' Identities = 20/74 (27%), ...' -> '74' (the token after the '/').
	stream = get_string_stream h
	line = skip_io_until(stream,'Identities')
	line.split('/')[1].split()[0].strip
end


def parse_multi_blast_fix_l(fname)
	# Like parse_multi_blast, but only prints query name, hit name and the
	# alignment length taken from each hit's 'Identities' header.
	iterate_blast_queries(fname, -1) do |q|
		q_name = q.split(" ")[1]
		iterate_hits_of_single_query(q) do |h, p_name|
			puts "#{q_name},#{p_name},#{parse_hit_header(h)}"
		end
	end
end


#---------------------------------------------------
# def preproc_xyz	
# 	path_unzipped= "/home/snepomny/structures/data/open/"
# 	set_w_to_file "/tmp/warn.log"
#  	set_e_to_file "/tmp/errors.log"
# 	counter = 0
#  	Dir["/home/snepomny/structures/data/open/*.ent"].each do |path| #get_some_chain_names(100).
# 		begin 
# 			get_xyz_for_all_chains path if Dir[path + ".A"].empty?
# 					# convert_verb_xyz_to_simple_xyz path if ! Dir[path + ".?"].empty?#path_unzipped + pair[0] # get_xyz_for_chain(path_unzipped + pair[0],pair[1]) 
# 			
# 			counter = counter+1
# 			puts counter.to_s  if counter % 1000 == 1
# 		rescue
# 			puts path #err_log "exception  in #{path}!" 
# 		end
# 	end
# end


#-------------------------------------------------------------------

#get_xyz_for_chain(ARGV[0],ARGV[1]).each do |pair|
#	print pair[1]
#end

#convert_verb_xyz_to_simple_xyz ARGV[0]
#puts get_verb_xyz_residues ARGV[0]


#-------------------------------------------------------------------
#index into blast alignment data:
def index_blast_qs_aligned(q,s,offset_q,offset_s)
	#puts "oq: #{offset_q} os: #{offset_s}"
	dash = "-"[0]
	raise "Bad aligned \n #{q} \n #{s}" if q.size != s.size
	index_q = []
	index_s = []
	(0..q.size-1).each do |i|
		if q[i]!=dash and s[i]!=dash #common case:
			index_q << offset_q
			index_s << offset_s
			offset_q += 1
			offset_s += 1
		elsif q[i]==dash and s[i]!=dash
			offset_s += 1
		elsif q[i]!=dash and s[i]==dash
			offset_q += 1
		else #shouldn't happen
			raise "#{__FILE__}:#{__LINE__} two dashes happened"
			offset_q += 1
			offset_s += 1
		end #ehd if
	end #end loop
	return [index_q, index_s]
end

#-------------------------------------------------------------------
def find_to_the_right(arr,leftmost,value)
	# Linear scan of the sorted array +arr+ starting at index +leftmost+.
	# Returns -1 when the search can't start (index out of range, or the
	# first candidate already exceeds +value+), the index of +value+ when
	# found, or arr.size when the scan runs off the end without a match.
	return -1 if leftmost >= arr.size or arr[leftmost] > value #this is sorted array
	idx = leftmost
	idx += 1 while idx < arr.size and arr[idx] != value
	idx
end
#-------------------------------------------------------------------
#Given a list and a pattern - two arrays of monotonically increasing naturals -
#replace with -1 every list[i] whose value does not appear in pattern
def filter_index_list_with_pattern(list,pattern)
	# Keeps only the values of +list+ that also occur in +pattern+ (both
	# arrays of increasing naturals); every other entry becomes -1.
	# The scan position +right+ only moves forward, exploiting the order.
	psize = pattern.size
	right = 0
	list.map do |value|
		p_ind = find_to_the_right(pattern, right, value)
		if p_ind >= 0 and p_ind < psize
			right = p_ind
			value
		else
			-1
		end
	end
end
#-------------------------------------------------------------------
#For any Blast hit query-subject will use seqres (fasta) data, alignment by Blast and xyz indexing of atoms (ca) 
#to return 2 aligned and valid xyz structures[]
def xyz_by_alignment_after_filtering(   blast_q, blast_s, blast_offset_q, blast_offset_s, 
					seq_q, seq_s, atoms_q, atoms_s, xyz_q, xyz_s)
# #######[]
#   1) Blast aligns residues taken from seqres section
#   2) Blast isn't using all the residues - there is sometimes starting offset and 'change'
#   3) Blast may insert '-' in query or subject residue strings we should skip those
#   4) In atoms section some residues lack 'CA' - we will have to filter them out from alignment too
#   
#   So the plan is:
# 	* make SEQ_ALI_S (will contain indices of subject residues) and SEQ_ALI_Q  (will contain indices of query residues) - this is taking care of places with '-'. |SEQ_ALI_S| == |SEQ_ALI_Q|
# 	* make ATOM_CA_S (will contain indices of subject residues that had 'ca' in atom section) and ATOM_CA_Q (will contain indices of query residues that had 'ca' in atom section) 
# 	* filter out SEQ_ALI_S with ATOM_CA_S and SEQ_ALI_Q with ATOM_CA_Q
# 	* go over FIL_SEQ_ALI_S and FIL_SEQ_ALI_Q  to eliminate all the misses cause by removal of 'ca' from one of them
# 	* use SEQ_ALI_S and SEQ_ALI_Q to index XYZ_S and XYZ_Q arrays and get ALI_XYZ_S and ALI_XYZ_Q
# 	* run RMSD (or whatever) on ALI_XYZ_S and ALI_XYZ_Q
# ####
	# Step 1: turn the gapped blast alignment into residue-index lists.
	arr =  index_blast_qs_aligned(blast_q,blast_s,blast_offset_q,blast_offset_s)				
	seq_ali_q = arr[0] #index blast answer into seqres 
	seq_ali_s = arr[1] #index blast answer into seqres
	raise "|seq_ali_q| != |seq_ali_s| " if seq_ali_q.size != seq_ali_s.size
	#puts seq_ali_q.join("") puts "-------\n" puts seq_ali_s.join("") puts seq_ali_s.size.to_s
	
	# Step 2: index the ATOM-section residues into the seqres sequences.
	# NOTE(review): simple_align is defined in a required lib - presumably
	# returns index arrays like index_blast_qs_aligned; verify there.
	atom_ca_q =  simple_align(seq_q,atoms_q,0,0) #index atoms into seqres
	atom_ca_s =  simple_align(seq_s,atoms_s,0,0) #index atoms into seqres
	
	#use index only if it is present in atoms
	fil_seq_ali_q = filter_index_list_with_pattern(seq_ali_q,atom_ca_q) 
	fil_seq_ali_s = filter_index_list_with_pattern(seq_ali_s,atom_ca_s)
	
	raise "|fil_seq_ali_q| != |fil_seq_ali_s| " if fil_seq_ali_q.size != fil_seq_ali_s.size
	
	# Steps 3-4: pad the xyz data to seqres numbering, then pick the
	# filtered aligned positions out of each inflated structure.
	infl_q = inflate_xyz_atoms_to_match_seqres(seq_q,atoms_q,xyz_q)
	final_q = get_xyz_from_inflated_by_index(infl_q,fil_seq_ali_q)
	infl_s = inflate_xyz_atoms_to_match_seqres(seq_s,atoms_s,xyz_s)
	final_s = get_xyz_from_inflated_by_index(infl_s,fil_seq_ali_s)
	return [final_q,final_s]

	
end
#-------------------------------------------------------------------
def unit_test
	# Smoke tests for the alignment-index helpers; prints one true/false
	# per expectation (all should print true).
	puts (index_blast_qs_aligned('abcd','abcd',4,1).join(",") == "4,5,6,7,1,2,3,4")
	puts (index_blast_qs_aligned('a-cd','abcd',4,1).join(",") == "4,5,6,1,3,4")
	puts (index_blast_qs_aligned('abcd','a-cd',4,1).join(",") == "4,6,7,1,2,3")
	puts (index_blast_qs_aligned('a-cd','a-cd',4,1).join(",") == "4,6,7,1,3,4")
	indices = [1,2,3,4,5,6,  9,10,11]
	pattern = [5,6,8,9]
	puts (find_to_the_right(pattern,0,1) == -1)
	puts (find_to_the_right(pattern,1,6) == 1)
	puts (find_to_the_right(pattern,2,9) == 3)
	puts (find_to_the_right(pattern,1,10) == 4)
	puts (filter_index_list_with_pattern(indices,pattern).join(",") == "-1,-1,-1,-1,5,6,9,-1,-1")
end
#-------------------------------------------------------------------
def create_pair_xyz(pdb_q,chain_q,pdb_s,chain_s, seq_q, seq_s,blast_q,blast_s,blast_offset_q,blast_offset_s)
	# Loads the per-chain residue lists and CA coordinates from the
	# pre-split pdb files under the 'open' path (e.g. pdb2az3.ent.H and
	# pdb2az3.ent.H.3), then builds the two aligned xyz structures for the
	# given blast alignment. Returns [final_q, final_s].
	path = get_path('open')
	atoms_q = get_verb_xyz_residues  "#{path}pdb#{pdb_q}.ent.#{chain_q}"
	atoms_s = get_verb_xyz_residues  "#{path}pdb#{pdb_s}.ent.#{chain_s}"
	xyz_q = get_xyz_from_dot3("#{path}pdb#{pdb_q}.ent.#{chain_q}.3")
	xyz_s = get_xyz_from_dot3("#{path}pdb#{pdb_s}.ent.#{chain_s}.3")
	xyz_by_alignment_after_filtering(blast_q, blast_s, blast_offset_q, blast_offset_s,
					seq_q, seq_s, atoms_q, atoms_s, xyz_q, xyz_s)
end
# -------------------------------------------------------------------
# Writes the two aligned xyz structures to '<l>-<r>.hit.L' / '<l>-<r>.hit.R'.
# Each file starts with a '#comment' header line followed by one
# space-separated coordinate triplet per row.
# xyzs_arr: [left_structure, right_structure], each an array of coordinate
# arrays. Uses the block form of File.open so descriptors are closed even
# if a write raises (the previous version leaked them on error).
def dump_two_xyzs_to_file(l_name,r_name,xyzs_arr)
	File.open(l_name+"-"+r_name+".hit.L",'w') do |lf|
		lf.puts "#comment"
		xyzs_arr[0].each { |xyz| lf.puts xyz.join(" ") }
	end
	File.open(l_name+"-"+r_name+".hit.R",'w') do |rf|
		rf.puts "#comment"
		xyzs_arr[1].each { |xyz| rf.puts xyz.join(" ") }
	end
end
# -------------------------------------------------------------------
def xyz_from_blast_hit l_name,r_name,blast_offset_q,blast_offset_s,blast_q,blast_s
	# Builds and dumps the aligned xyz structures for one blast hit.
	# Chain names look like '2az3_H': pdb id, underscore, chain letter.
	pdb_q, chain_q = l_name.split("_")
	pdb_s, chain_s = r_name.split("_")
	
	seq_q = (get_fasta_by_name l_name).split("")
	seq_s = (get_fasta_by_name r_name).split("")
	
	# Blast offsets are 1-based; internal indexing is 0-based.
	two_xyzs = create_pair_xyz(pdb_q,chain_q, pdb_s, chain_s, seq_q, seq_s, blast_q,blast_s,blast_offset_q.to_i-1,blast_offset_s.to_i-1)
	raise "xyzs differ in size!" if two_xyzs[0].size != two_xyzs[1].size
	dump_two_xyzs_to_file(l_name,r_name,two_xyzs)
end

#-------------------------------------------------------------------
#will read pairs of chains. from file
#if blast information is not given for any pair will run bl2seq 
#will use blast data to synthesize two aligned xyz structures for TmScore and run it
def blast_pairs_from_file fname='',limit=1
	# Reads up to +limit+ chain-name pairs (two names per line) from
	# +fname+, runs bl2seq on each pair's .seq files and logs the first
	# HSP of every hit as a space-separated row.
	# Fixes vs. previous version: the 5th logged field was hsh[:seq_s]
	# printed twice - it is now hsh[:seq_q] - and the input file handle is
	# closed via the block form of File.open.
	counter = 1
	File.open(fname,"r") do |f|
		while (line = f.gets)
			arr =  line.split(" ")
			q_name = arr[0]
			s_name = arr[1]
			
			cmd ="#{$bl2seq} -i #{$seq_files}/#{q_name.sub("_","")}.seq -j #{$seq_files}/#{s_name.sub("_","")}.seq -p blastp  -F F > blast.tmp "
			%x[#{cmd}]
			
			iterate_blast_queries('blast.tmp', 1){ |q| 
				iterate_hits_of_single_query(q) { |h| 
					hsh = parse_hit(s_name,h)[0]
					next if !hsh				
					log_me "#{q_name} #{s_name} #{hsh[:left_q]} #{hsh[:left_s]} #{hsh[:seq_q]} #{hsh[:seq_s]} #{hsh[:right_q]} #{hsh[:right_s]} #{hsh[:evalue]} #{hsh[:ident]} #{hsh[:positives]} #{hsh[:length]}"
				}
			}
			counter += 1
			break if limit < counter - 1
		end
	end
end
#-------------------------------------------------------------------
# #unit_test
# #print index_blast_qs_aligned(q,s,2,9).join(",")
# blast_s = 'EKTFLMVKPDGVQRAFIGEIVARFEKKGFQLVGAKLMQVTPEIAGQHYAEHEEKPFFGELVDFITSGPVFAMVWQGEGVVDTARNMMGKTRPHEAAPGTIRGDFGVTVAKNIIHGSDSLE--SAEREIGIFFKEEELVDYSKLMNEWIY'
# blast_q = 'ERTFVMVKPDGVQRGLIGDIVTRLETKGLKMVGGKFMRIDEELAHEHYAEHEDKPFFDGLVSFITSGPVFAMVWEGADATRQVRQLMGATDAQDAAPGTIRGDYGNDLGHNLIHGSDHEDEGANEREIALFFDDDELVDWDRDASAWVY'
# blast_offset_s = 1
# blast_offset_q = 8



#HDERTFVMVKPDGVQRGLIGDIVTRLETKGLKMVGGKFMRIDEELAHEHYAEHEDKPFFDGLVSFITSGPVFAMVWEGADATRQVRQLMGATDAQDAAPGTIRGDYGNDLGHNLIHGSDHEDEGANEREIALFFDDDELVDWDRDASAWVYE'
#MEKTFLMVKPDGVQRAFIGEIVARFEKKGFQLVGAKLMQVTPEIAGQHYAEHEEKPFFGELVDFITSGPVFAMVWQGEGVVDTARNMMGKTRPHEAAPGTIRGDFGVTVAKNIIHGSDSLESAEREIGIFFKEEELVDYSKLMNEWIY
# 
# atoms_q = get_verb_xyz_residues  '/home/snepomny/structures/data/open/pdb2az3.ent.H'
# atoms_s = get_verb_xyz_residues  '/home/snepomny/structures/data/open/pdb2vu5.ent.A'
# xyz_q = get_xyz_from_dot3('/home/snepomny/structures/data/open/pdb2az3.ent.H.3')
# xyz_s = get_xyz_from_dot3('/home/snepomny/structures/data/open/pdb2vu5.ent.A.3')
# arr = xyz_by_alignment_after_filtering(   blast_q, blast_s, blast_offset_q, blast_offset_s, 
# 					seq_q.strip.split(""), seq_s.strip.split(""), atoms_q, atoms_s, xyz_q, xyz_s)
					

#print create_pair_xyz('2az3','H', '2vu5','A', blast_q,blast_s,blast_offset_q,blast_offset_s)
